
library(tidyverse)
library(caret)
library(ggridges)
source("imv_jcss.R")

# Read each essay data set and, in the same pipeline, recode Type into a
# 0/1 indicator: 1 where Type equals "Vert", 0 for every other value.
df_2015 <- read.csv("2015_2016_transfer_essay_topics_liwc.csv") %>%
  mutate(Type = ifelse(Type == "Vert", 1, 0))

df_2016 <- read.csv("2016_2017_transfer_essay_topics_liwc.csv") %>%
  mutate(Type = ifelse(Type == "Vert", 1, 0))

df_all <- read.csv("all_transfer_essay_topics_liwc.csv") %>%
  mutate(Type = ifelse(Type == "Vert", 1, 0))


### 2015 essays

# Predictor set 1: columns 33-124 of the 2015 data, without the Dash column.
liwc_2015 <- select(df_2015[, 33:124], -Dash)

# Predictor set 2: natural log of columns 3-32.
# Remove least correlated topic
topics_2015 <- select(log(df_2015[, 3:32]), -Communication_Design)

# Modelling frame: both predictor sets plus Type as a factor outcome.
df_2015_imv <- cbind(liwc_2015, topics_2015, Type = factor(df_2015$Type))

# Logistic regression of Type on every other column of the frame.
tmp <- glm(Type ~ ., family = "binomial", data = df_2015_imv)

# In-sample fitted probabilities and the observed outcome.
preds <- predict(tmp, type = "response")
actual <- df_2015_imv$Type

# Classify at a 0.5 cut-off and report the in-sample accuracy.
preds_list <- ifelse(preds >= .5, 1, 0)
mean(preds_list == actual)

# Input for imv() (defined in the sourced imv_jcss.R): predictors, the 0/1
# response, the model's fitted probabilities (pv2) and a constant pv1.
# NOTE(review): pv1 is hard-coded; presumably a baseline prediction for
# this data set -- confirm against imv_jcss.R.
imv_df_2 <- data.frame(
  x = as.matrix(cbind(liwc_2015, topics_2015)),
  resp = df_2015$Type,
  pv2 = preds,
  pv1 = 0.9178506
)

imv(imv_df_2)

# This will take a while to run. Load the data file below instead.
#undersamp_folds_2015 <- trainControl(method = "repeatedcv",
#                                     sampling = "down",
#                                     number = 15,
#                                     repeats = 100)


#undersamp_cv_2015 <- train(Type ~ ., method = "glm", family = "binomial",
#                           data = df_2015_imv, trControl = undersamp_folds_2015)

# Restore the saved objects instead of re-running the slow resampling
# (presumably these files hold the results of the commented-out
# trainControl()/train() calls above), then show the fitted summary.
for (saved_file in c("undersamp_folds_2015.RDS", "undersamp_cv_2015.RDS")) {
  load(saved_file)
}
print(undersamp_cv_2015)

### 2016 essays

# Predictor set 1: columns 33-124 of the 2016 data, without the Dash column.
liwc_2016 <- df_2016[, 33:124]
liwc_2016 <- liwc_2016 %>%
  select(-Dash)

# Predictor set 2: natural log of columns 3-32.
topics_2016 <- log(df_2016[, 3:32])
# Drop least correlated topic
topics_2016 <- topics_2016 %>%
  select(-Self.assessment)

# Modelling frame: both predictor sets plus Type as a factor outcome.
df_2016_imv <- cbind(liwc_2016,
                     topics_2016,
                     Type = factor(df_2016$Type))

# Logistic regression of Type on every other column of the frame.
tmp_glm_2016 <- glm(Type ~ ., data = df_2016_imv,
                    family = "binomial")

# In-sample fitted probabilities and the observed outcome.
preds_2016 <- predict(tmp_glm_2016, type = "response")

actual_2016 <- df_2016_imv$Type

# Classify at a 0.5 cut-off. This must threshold the 2016 probabilities
# (preds_2016); using `preds` would score the 2015 model's predictions
# against the 2016 labels.
preds_list_2016 <- ifelse(preds_2016 >= .5, 1, 0)

# In-sample accuracy of the 2016 model.
mean(preds_list_2016 == actual_2016)

# Input for imv() (defined in the sourced imv_jcss.R): predictors, the 0/1
# response, the model's fitted probabilities (pv2) and a constant pv1.
# NOTE(review): pv1 is hard-coded; presumably a baseline prediction for
# this data set -- confirm against imv_jcss.R.
imv_df_2016 <- data.frame(x = as.matrix(cbind(liwc_2016, topics_2016)),
                          resp = df_2016$Type,
                          pv2 = preds_2016,
                          pv1 = 0.923452)

imv(imv_df_2016)

# See note above
#undersamp_folds_2016 <- trainControl(method = "repeatedcv",
#                                     sampling = "down",
#                                     number = 15,
#                                     repeats = 100)

#undersamp_cv_2016 <- train(Type ~ ., method = "glm", family = "binomial",
#                           data = df_2016_imv, trControl = undersamp_folds_2016)

# Restore the saved objects instead of re-running the slow resampling
# (presumably these files hold the results of the commented-out
# trainControl()/train() calls above), then show the fitted summary.
for (saved_file in c("undersamp_folds_2016.RDS", "undersamp_cv_2016.RDS")) {
  load(saved_file)
}
print(undersamp_cv_2016)

### All

# Predictor set 1: columns 53-144 of the combined data, without Dash.
liwc_all <- select(df_all[, 53:144], -Dash)

# Predictor set 2: natural log of columns 3-52.
# Drop least correlated topic
topics_all <- select(log(df_all[, 3:52]), -Degree.attainment)

# Modelling frame: both predictor sets plus Type as a factor outcome.
df_all_imv <- cbind(liwc_all, topics_all, Type = factor(df_all$Type))

# Logistic regression of Type on every other column of the frame.
tmp_glm_all <- glm(Type ~ ., family = "binomial", data = df_all_imv)

# In-sample fitted probabilities and the observed outcome.
preds_all <- predict(tmp_glm_all, type = "response")
actual_all <- df_all_imv$Type

# Classify at a 0.5 cut-off and report the in-sample accuracy.
preds_list_all <- ifelse(preds_all >= .5, 1, 0)
mean(preds_list_all == actual_all)

# Input for imv() (defined in the sourced imv_jcss.R): predictors, the 0/1
# response, the model's fitted probabilities (pv2) and a constant pv1.
# NOTE(review): pv1 is hard-coded; presumably a baseline prediction for
# this data set -- confirm against imv_jcss.R.
imv_df_all <- data.frame(
  x = as.matrix(cbind(liwc_all, topics_all)),
  resp = df_all$Type,
  pv2 = preds_all,
  pv1 = 0.9206372
)

imv(imv_df_all)

# See note above
#undersamp_folds_all <- trainControl(method = "repeatedcv",
#                                    sampling = "down",
#                                    number = 15,
#                                    repeats = 100)

#undersamp_cv_all <- train(Type ~ ., method = "glm", family = "binomial",
#                          data = df_all_imv, trControl = undersamp_folds_all)

# Restore the saved objects instead of re-running the slow resampling
# (presumably these files hold the results of the commented-out
# trainControl()/train() calls above), then show the fitted summary.
for (saved_file in c("undersamp_folds_all.RDS", "undersamp_cv_all.RDS")) {
  load(saved_file)
}
print(undersamp_cv_all)

### Visual
# Collect the per-resample accuracies stored in each loaded fit
# (`$resample$Accuracy`), tagged with the data set they belong to.
undersamp_preds_2016 <- data.frame(Accuracy = undersamp_cv_2016$resample$Accuracy,
                                   Year = "2016")

undersamp_preds_2015 <- data.frame(Accuracy = undersamp_cv_2015$resample$Accuracy,
                                   Year = "2015")

undersamp_preds_all <- data.frame(Accuracy = undersamp_cv_all$resample$Accuracy,
                                  Year = "Both")

# Stack the three sets into one long data frame for plotting.
undersamp_preds_df <- rbind(undersamp_preds_2015,
                            undersamp_preds_2016,
                            undersamp_preds_all)

# Ridge plot of the accuracy distribution per data set. The "quantile" line
# is drawn at the mean because quantile_fun returns mean(x).
# NOTE(review): the x-axis labels are positional and no `breaks` are given,
# so they only line up while ggplot picks exactly four breaks -- confirm.
acc_by_year_plot <- ggplot(undersamp_preds_df, aes(x = Accuracy, y = Year)) +
  geom_density_ridges(aes(fill = Year, color = Accuracy),
                      quantile_lines = TRUE,
                      quantile_fun = function(x, ...) mean(x),
                      rel_min_height = .005) +
  scale_fill_manual(values = c("red2", "gold", "turquoise2")) +
  scale_x_continuous(labels = c("", "60%", "70%", "80%")) +
  theme_ridges(center_axis_labels = TRUE,
               font_size = 16) +
  theme(legend.position = "none")

# NOTE(review): "figur" in the file name looks like a typo; it is kept so
# that anything expecting this exact output path keeps working.
pdf("acc_by_year_figur.pdf", width = 8)
# Print explicitly: a ggplot object is only drawn when it is printed, and
# top-level auto-printing does not happen when this script is run through
# source(), which would leave the PDF without a plot.
print(acc_by_year_plot)

dev.off()
